doctra 0.3.2__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- doctra/__init__.py +4 -0
- doctra/cli/main.py +168 -0
- doctra/engines/image_restoration/__init__.py +10 -0
- doctra/engines/image_restoration/docres_engine.py +566 -0
- doctra/engines/vlm/service.py +0 -12
- doctra/parsers/enhanced_pdf_parser.py +370 -0
- doctra/parsers/structured_pdf_parser.py +11 -60
- doctra/parsers/table_chart_extractor.py +8 -44
- doctra/third_party/docres/data/MBD/MBD.py +110 -0
- doctra/third_party/docres/data/MBD/MBD_utils.py +291 -0
- doctra/third_party/docres/data/MBD/infer.py +151 -0
- doctra/third_party/docres/data/MBD/model/deep_lab_model/aspp.py +95 -0
- doctra/third_party/docres/data/MBD/model/deep_lab_model/backbone/__init__.py +13 -0
- doctra/third_party/docres/data/MBD/model/deep_lab_model/backbone/drn.py +402 -0
- doctra/third_party/docres/data/MBD/model/deep_lab_model/backbone/mobilenet.py +151 -0
- doctra/third_party/docres/data/MBD/model/deep_lab_model/backbone/resnet.py +170 -0
- doctra/third_party/docres/data/MBD/model/deep_lab_model/backbone/xception.py +288 -0
- doctra/third_party/docres/data/MBD/model/deep_lab_model/decoder.py +59 -0
- doctra/third_party/docres/data/MBD/model/deep_lab_model/deeplab.py +81 -0
- doctra/third_party/docres/data/MBD/model/deep_lab_model/sync_batchnorm/__init__.py +12 -0
- doctra/third_party/docres/data/MBD/model/deep_lab_model/sync_batchnorm/batchnorm.py +282 -0
- doctra/third_party/docres/data/MBD/model/deep_lab_model/sync_batchnorm/comm.py +129 -0
- doctra/third_party/docres/data/MBD/model/deep_lab_model/sync_batchnorm/replicate.py +88 -0
- doctra/third_party/docres/data/MBD/model/deep_lab_model/sync_batchnorm/unittest.py +29 -0
- doctra/third_party/docres/data/preprocess/crop_merge_image.py +142 -0
- doctra/third_party/docres/inference.py +370 -0
- doctra/third_party/docres/models/restormer_arch.py +308 -0
- doctra/third_party/docres/utils.py +464 -0
- doctra/ui/app.py +5 -32
- doctra/utils/progress.py +13 -98
- doctra/utils/structured_utils.py +45 -49
- doctra/version.py +1 -1
- {doctra-0.3.2.dist-info → doctra-0.4.0.dist-info}/METADATA +1 -1
- doctra-0.4.0.dist-info/RECORD +67 -0
- doctra-0.3.2.dist-info/RECORD +0 -44
- {doctra-0.3.2.dist-info → doctra-0.4.0.dist-info}/WHEEL +0 -0
- {doctra-0.3.2.dist-info → doctra-0.4.0.dist-info}/licenses/LICENSE +0 -0
- {doctra-0.3.2.dist-info → doctra-0.4.0.dist-info}/top_level.txt +0 -0
doctra/__init__.py
CHANGED
@@ -4,13 +4,17 @@ Parse, extract, and analyze documents with ease
|
|
4
4
|
"""
|
5
5
|
|
6
6
|
from .parsers.structured_pdf_parser import StructuredPDFParser
|
7
|
+
from .parsers.enhanced_pdf_parser import EnhancedPDFParser
|
7
8
|
from .parsers.table_chart_extractor import ChartTablePDFParser
|
9
|
+
from .engines.image_restoration import DocResEngine
|
8
10
|
from .version import __version__
|
9
11
|
from .ui import build_demo, launch_ui
|
10
12
|
|
11
13
|
__all__ = [
|
12
14
|
'StructuredPDFParser',
|
15
|
+
'EnhancedPDFParser',
|
13
16
|
'ChartTablePDFParser',
|
17
|
+
'DocResEngine',
|
14
18
|
'build_demo',
|
15
19
|
'launch_ui',
|
16
20
|
'__version__'
|
doctra/cli/main.py
CHANGED
@@ -15,12 +15,14 @@ from typing import Optional
|
|
15
15
|
# Import parsers
|
16
16
|
try:
|
17
17
|
from doctra.parsers.structured_pdf_parser_enhancer import StructuredPDFParser
|
18
|
+
from doctra.parsers.enhanced_pdf_parser import EnhancedPDFParser
|
18
19
|
from doctra.parsers.chart_table_pdf_parser import ChartTablePDFParser
|
19
20
|
except ImportError:
|
20
21
|
# Fallback for development/testing
|
21
22
|
project_root = Path(__file__).parent.parent.parent
|
22
23
|
sys.path.insert(0, str(project_root))
|
23
24
|
from doctra.parsers.structured_pdf_parser import StructuredPDFParser
|
25
|
+
from doctra.parsers.enhanced_pdf_parser import EnhancedPDFParser
|
24
26
|
from doctra.parsers.table_chart_extractor import ChartTablePDFParser
|
25
27
|
|
26
28
|
|
@@ -37,6 +39,7 @@ def cli(ctx):
|
|
37
39
|
\b
|
38
40
|
Commands:
|
39
41
|
parse Full document parsing with text, tables, charts, and figures
|
42
|
+
enhance Enhanced parsing with DocRes image restoration
|
40
43
|
extract Extract only charts and/or tables from documents
|
41
44
|
visualize Visualize layout detection results
|
42
45
|
analyze Quick document analysis without processing
|
@@ -45,6 +48,7 @@ def cli(ctx):
|
|
45
48
|
\b
|
46
49
|
Examples:
|
47
50
|
doctra parse document.pdf # Full document parsing
|
51
|
+
doctra enhance document.pdf # Enhanced parsing with image restoration
|
48
52
|
doctra extract charts document.pdf # Extract only charts
|
49
53
|
doctra extract both document.pdf --use-vlm # Extract charts & tables with VLM
|
50
54
|
doctra visualize document.pdf # Visualize layout detection
|
@@ -275,6 +279,153 @@ def parse(pdf_path: Path, output_dir: Optional[Path], use_vlm: bool,
|
|
275
279
|
os.chdir(original_cwd)
|
276
280
|
|
277
281
|
|
282
|
+
@cli.command()
@click.argument('pdf_path', type=click.Path(exists=True, path_type=Path))
@click.option('--output-dir', '-o', type=click.Path(path_type=Path),
              help='Output directory (default: outputs/{pdf_filename}_enhanced)')
@click.option('--restoration-task',
              type=click.Choice(['dewarping', 'deshadowing', 'appearance',
                                 'deblurring', 'binarization', 'end2end']),
              default='appearance', help='DocRes restoration task (default: appearance)')
@click.option('--restoration-device', type=click.Choice(['cuda', 'cpu']),
              help='Device for DocRes processing (default: auto-detect)')
@click.option('--restoration-dpi', type=int, default=200,
              help='DPI for restoration processing (default: 200)')
@vlm_options
@layout_options
@ocr_options
@click.option('--box-separator', default='\n',
              help='Separator between text boxes in output (default: newline)')
@click.option('--verbose', '-v', is_flag=True,
              help='Enable verbose output')
def enhance(pdf_path: Path, output_dir: Optional[Path], restoration_task: str,
            restoration_device: Optional[str], restoration_dpi: int,
            use_vlm: bool, vlm_provider: str, vlm_model: Optional[str], vlm_api_key: Optional[str],
            layout_model: str, dpi: int, min_score: float,
            ocr_lang: str, ocr_psm: int, ocr_oem: int, ocr_config: str,
            box_separator: str, verbose: bool):
    """
    Enhanced PDF parsing with DocRes image restoration.

    Performs document processing with image restoration to improve quality
    before layout detection and content extraction. Particularly useful for
    scanned documents, low-quality PDFs, or documents with shadows/distortion.

    \b
    Restoration Tasks:
    appearance - General appearance enhancement (default)
    dewarping - Correct document perspective distortion
    deshadowing - Remove shadows from documents
    deblurring - Reduce blur in document images
    binarization - Convert to clean black/white text
    end2end - Complete pipeline: dewarping → deshadowing → appearance

    \b
    Examples:
    doctra enhance document.pdf
    doctra enhance document.pdf --restoration-task dewarping
    doctra enhance document.pdf --restoration-task end2end --restoration-device cuda
    doctra enhance document.pdf --use-vlm --vlm-api-key your_key
    doctra enhance document.pdf -o ./enhanced_results --restoration-dpi 300
    doctra enhance document.pdf --restoration-task deshadowing # Use different restoration task

    :param pdf_path: Path to the input PDF file
    :param output_dir: Output directory for results (optional)
    :param restoration_task: DocRes restoration task to perform
    :param restoration_device: Device for DocRes processing
    :param restoration_dpi: DPI for restoration processing
    :param use_vlm: Whether to use VLM for enhanced extraction
    :param vlm_provider: VLM provider ('gemini' or 'openai')
    :param vlm_model: Model name to use (defaults to provider-specific defaults)
    :param vlm_api_key: API key for VLM provider
    :param layout_model: Layout detection model name
    :param dpi: DPI for PDF rendering
    :param min_score: Minimum confidence score for layout detection
    :param ocr_lang: OCR language code
    :param ocr_psm: Tesseract page segmentation mode
    :param ocr_oem: Tesseract OCR engine mode
    :param ocr_config: Additional Tesseract configuration
    :param box_separator: Separator between text boxes in output
    :param verbose: Whether to enable verbose output
    :return: None
    """
    # NOTE: the docstring above is rendered by click as the command's help
    # text, so implementation notes are kept in comments instead.
    validate_vlm_config(use_vlm, vlm_api_key)

    # Path.absolute() resolves against the *current* working directory.
    # Because this command later chdir()s into the output directory, both
    # paths must be made absolute up front; otherwise a relative PDF path
    # would be looked up inside the output directory, and a relative output
    # directory would be reported/passed on as nested inside itself.
    pdf_abs = pdf_path.absolute()
    output_abs = output_dir.absolute() if output_dir else None

    if verbose:
        click.echo("🔍 Starting enhanced PDF parsing with DocRes...")
        click.echo(f" Input: {pdf_path}")
        click.echo(f" Restoration task: {restoration_task}")
        click.echo(f" Restoration device: {restoration_device or 'auto-detect'}")
        click.echo(f" Restoration DPI: {restoration_dpi}")
        if output_dir:
            click.echo(f" Output: {output_dir}")

    # Create enhanced parser instance
    try:
        click.echo("🔧 Initializing enhanced parser with DocRes...")
        if verbose:
            if use_vlm:
                click.echo(f" VLM Provider: {vlm_provider}")
                click.echo(f" VLM Model: {vlm_model or 'default'}")
            click.echo(f" Layout Model: {layout_model}")
            click.echo(f" DPI: {dpi}")
            click.echo(f" OCR Language: {ocr_lang}")
        elif use_vlm:
            click.echo(f" Using VLM: {vlm_provider}")

        parser = EnhancedPDFParser(
            use_image_restoration=True,
            restoration_task=restoration_task,
            restoration_device=restoration_device,
            restoration_dpi=restoration_dpi,
            use_vlm=use_vlm,
            vlm_provider=vlm_provider,
            vlm_model=vlm_model,
            vlm_api_key=vlm_api_key,
            layout_model_name=layout_model,
            dpi=dpi,
            min_score=min_score,
            ocr_lang=ocr_lang,
            ocr_psm=ocr_psm,
            ocr_oem=ocr_oem,
            ocr_extra_config=ocr_config,
            box_separator=box_separator,
        )
    except Exception as e:
        # Top-level CLI boundary: report the failure and exit non-zero.
        click.echo(f"❌ Error initializing enhanced parser: {e}", err=True)
        if verbose:
            import traceback
            click.echo(traceback.format_exc(), err=True)
        sys.exit(1)

    # Change to output directory if specified
    original_cwd = os.getcwd()
    if output_abs is not None:
        output_abs.mkdir(parents=True, exist_ok=True)
        os.chdir(output_abs)
        click.echo(f"📁 Output directory: {output_abs}")

    try:
        # Parse the document with enhancement
        click.echo(f"📄 Processing with enhancement: {pdf_path.name}")
        parser.parse(str(pdf_abs), str(output_abs) if output_abs is not None else None)
        click.echo("✅ Enhanced document processing completed successfully!")
        click.echo(f"📁 Output directory: {output_abs if output_abs is not None else 'outputs/'}")

    except KeyboardInterrupt:
        click.echo("\n⚠️ Processing interrupted by user", err=True)
        sys.exit(130)
    except Exception as e:
        click.echo(f"❌ Error during enhanced parsing: {e}", err=True)
        if verbose:
            import traceback
            click.echo(traceback.format_exc(), err=True)
        sys.exit(1)
    finally:
        # Restore original working directory (runs even on sys.exit above)
        os.chdir(original_cwd)
|
427
|
+
|
428
|
+
|
278
429
|
@cli.group(invoke_without_command=True)
|
279
430
|
@click.pass_context
|
280
431
|
def extract(ctx):
|
@@ -782,6 +933,9 @@ def info():
|
|
782
933
|
('pytesseract', 'pytesseract', 'OCR engine'),
|
783
934
|
('tqdm', 'tqdm', 'Progress bars'),
|
784
935
|
('click', 'click', 'CLI framework'),
|
936
|
+
('skimage', 'scikit-image', 'DocRes image restoration'),
|
937
|
+
('torch', 'torch', 'DocRes neural networks'),
|
938
|
+
('huggingface_hub', 'huggingface_hub', 'Hugging Face model downloads'),
|
785
939
|
]
|
786
940
|
|
787
941
|
click.echo("\nCore Dependencies:")
|
@@ -811,6 +965,7 @@ def info():
|
|
811
965
|
# Available commands
|
812
966
|
click.echo("\nAvailable Commands:")
|
813
967
|
click.echo(" 📄 parse - Full document processing (text, tables, charts, figures)")
|
968
|
+
click.echo(" ✨ enhance - Enhanced parsing with DocRes image restoration")
|
814
969
|
click.echo(" 📊 extract - Chart/table extraction only")
|
815
970
|
click.echo(" ├─ charts - Extract only charts")
|
816
971
|
click.echo(" ├─ tables - Extract only tables")
|
@@ -845,9 +1000,22 @@ def info():
|
|
845
1000
|
else:
|
846
1001
|
click.echo(" VLM_API_KEY: (not set)")
|
847
1002
|
|
1003
|
+
# DocRes information
|
1004
|
+
click.echo("\nDocRes Image Restoration:")
|
1005
|
+
try:
|
1006
|
+
from doctra.engines.image_restoration import DocResEngine
|
1007
|
+
docres = DocResEngine()
|
1008
|
+
click.echo(f" ✅ DocRes available - {len(docres.get_supported_tasks())} restoration tasks")
|
1009
|
+
click.echo(" Tasks: dewarping, deshadowing, appearance, deblurring, binarization, end2end")
|
1010
|
+
click.echo(" 📥 Models: Downloaded from Hugging Face Hub")
|
1011
|
+
except Exception as e:
|
1012
|
+
click.echo(f" ⚠️ DocRes not available - {str(e)[:50]}...")
|
1013
|
+
click.echo(" Install with: pip install scikit-image torch huggingface_hub")
|
1014
|
+
|
848
1015
|
# Usage examples
|
849
1016
|
click.echo("\n💡 Quick Start Examples:")
|
850
1017
|
click.echo(" doctra parse document.pdf # Full document parsing")
|
1018
|
+
click.echo(" doctra enhance document.pdf # Enhanced parsing with DocRes")
|
851
1019
|
click.echo(" doctra extract both document.pdf --use-vlm # Charts & tables with VLM")
|
852
1020
|
click.echo(" doctra extract charts document.pdf # Only charts")
|
853
1021
|
click.echo(" doctra extract tables document.pdf # Only tables")
|
@@ -0,0 +1,10 @@
|
|
1
|
+
"""
Image restoration engines.

Document image restoration support for the document-processing pipeline.
DocRes is the engine currently supported; it covers a range of document
image restoration tasks.
"""

from .docres_engine import DocResEngine

# Public API of this package: only the DocRes engine is re-exported.
__all__ = ["DocResEngine"]
|